# Name of the cluster, rendered from the template variable
# cluster_name_on_cloud.
cluster_name: {{cluster_name_on_cloud}}

# The maximum number of worker nodes to launch in addition to the head node.
max_workers: {{num_nodes - 1}}
# Rendered to the same value as max_workers.
# NOTE(review): presumably so that all workers can be requested in a single
# scaling step -- confirm against the Ray autoscaler documentation.
upscaling_speed: {{num_nodes - 1}}
# NOTE(review): hard-coded to 60; presumably the idle time (in minutes) the
# autoscaler waits before acting on idle nodes -- confirm against the Ray
# cluster config reference.
idle_timeout_minutes: 60

{%- if docker_image is not none %}
# Container settings. This whole section is rendered only when the template
# variable docker_image is set.
docker:
  image: {{docker_image}}
  container_name: {{docker_container_name}}
  run_options:
    # Raise the open-file limit inside the container; the same value is written
    # to /etc/security/limits.conf on the host in setup_commands.
    - --ulimit nofile=1048576:1048576
    {%- for run_option in docker_run_options %}
    - {{run_option}}
    {%- endfor %}
  {%- if docker_login_config is not none %}
  # Registry credentials, rendered only when docker_login_config is provided.
  # Each value is a literal block scalar with the trailing newline stripped.
  # The password is passed through the indent(6) filter so that continuation
  # lines of a multi-line value stay aligned inside the block scalar.
  docker_login_config:
    username: |-
      {{docker_login_config.username}}
    password: |-
      {{docker_login_config.password | indent(6) }}
    server: |-
      {{docker_login_config.server}}
  {%- endif %}
{%- endif %}

# Node provider settings: an external provider implemented by the
# sky.provision.azure module.
provider:
  type: external
  module: sky.provision.azure
  location: {{region}}
  # On Azure, ray tells instances apart by their resource_group rather than by
  # the cluster_name, so setting it here ensures that ray creates new
  # instances for a different cluster_name.
  # Ref: https://github.com/ray-project/ray/blob/2367a2cb9033913b68b1230316496ae273c25b54/python/ray/autoscaler/_private/_azure/node_provider.py#L87
  resource_group: {{resource_group}}
  use_external_resource_group: {{use_external_resource_group}}
  # Keep stopped nodes; otherwise they cannot be reused when re-provisioning.
  # teardown(terminate=True) will override this.
  cache_stopped_nodes: true
  # Subscription ID of the Azure user.
  subscription_id: {{azure_subscription_id}}
  # The launch config check for worker nodes is disabled because it can cause
  # resource leakage. The upper-level SkyPilot code makes sure that no
  # resources are leaked.
  # Reference: https://github.com/ray-project/ray/blob/cd1ba65e239360c8a7b130f991ed414eccc063ce/python/ray/autoscaler/_private/autoscaler.py#L1115
  disable_launch_config_check: true


# SSH login settings. The same private key file is also mounted on the nodes
# as ~/.ssh/sky-cluster-key via file_mounts.
auth:
  ssh_user: azureuser
  ssh_private_key: {{ssh_private_key}}

# Only one node type is defined here; head_node_type below points at it.
available_node_types:
    ray.head.default:
      resources: {}
      node_config:
        tags:
          skypilot-user: {{ user }}
        azure_arm_parameters:
            # NOTE(review): the `skypilot:...` values below look like
            # placeholders that are substituted after rendering -- confirm
            # against the provisioner code.
            adminUsername: skypilot:ssh_user
            publicKey: |
              skypilot:ssh_public_key_content
            vmSize: {{instance_type}}
            # List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
            imagePublisher: {{image_publisher}}
            imageOffer: {{image_offer}}
            # Quoted, unlike its siblings; presumably so that a numeric-looking
            # SKU is kept as a string -- confirm.
            imageSku: "{{image_sku}}"
            imageVersion: {{image_version}}
            # Community Gallery Image ID
            communityGalleryImageId: {{community_gallery_image_id}}
            osDiskSizeGB: {{disk_size}}
            osDiskTier: {{disk_tier}}
            {%- if use_spot %}
            # optionally set priority to use Spot instances
            priority: Spot
            {%- endif %}
            # One entry of cloud_init_setup_commands per line, emitted into a
            # literal block scalar with the trailing newline stripped.
            cloudInitSetupCommands: |-
              {%- for cmd in cloud_init_setup_commands %}
              {{ cmd }}
              {%- endfor %}
        {%- if disk_performance_tier is not none %}
        # Sibling of azure_arm_parameters under node_config; rendered only when
        # disk_performance_tier is set.
        disk_performance_tier: {{disk_performance_tier}}
        {%- endif %}
        # TODO: attach disk

head_node_type: ray.head.default

# Format: `REMOTE_PATH : LOCAL_PATH`
# One entry is rendered per item in `credentials`. The cluster SSH key is
# mounted exactly once, after the loop, so that it is present even when
# `credentials` is empty and is never emitted as a duplicate key when there
# are several credentials. setup_commands registers ~/.ssh/sky-cluster-key as
# an IdentityFile, so this mount must always exist.
file_mounts: {
  "{{sky_ray_yaml_remote_path}}": "{{sky_ray_yaml_local_path}}",
  "{{sky_remote_path}}/{{sky_wheel_hash}}": "{{sky_local_path}}",
{%- for remote_path, local_path in credentials.items() %}
  "{{remote_path}}": "{{local_path}}",
{%- endfor %}
  "~/.ssh/sky-cluster-key": "{{ssh_private_key}}",
}


# List of shell commands to run to set up nodes.
# NOTE: these are very performance-sensitive. Each new item opens/closes an SSH
# connection, which is expensive. Try your best to co-locate commands into fewer
# items!
#
# Keep the following count up to date to make performance bugs easier to catch:
#   current num items (num SSH connections): 1
setup_commands:
  # The single item below is one multi-line scalar, i.e. one shell command
  # line. Do not insert comments between its lines. What its parts do:
  # Line 'touch ~/.sudo_as_admin_successful': NOTE(review): presumably
  #   suppresses the first-use sudo hint on login -- confirm.
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
  # Line 'sudo grep ..': set the number of threads per process to unlimited to
  #   avoid `ray job submit` getting stuck when the number of running ray jobs
  #   increases. Only rendered when no docker image is used.
  # Line 'sudo systemctl stop jupyter ..': stop the jupyter service to avoid a port conflict on 8888.
  # Line 'sudo systemctl stop jupyterhub ..': stop the jupyterhub service to avoid a port conflict on 8081.
  # Line 'mkdir -p ~/.ssh; (grep ..': disable host key checking and add
  #   ~/.ssh/sky-cluster-key to the SSH config (unless the stanza is already
  #   there) to allow nodes within a cluster to connect to each other.
  # Line '[ -f /etc/fuse.conf ] ..': enable user_allow_other in /etc/fuse.conf
  #   (needed for the `-o allow_other` option of `goofys`).
  # Line 'python3 -c ..': patch the buggy ray files. This also kills the
  #   service that is holding the lock on dpkg (problem only exists on
  #   aws/azure, not gcp).
  #   NOTE(review): no such line is visible in this template; presumably it
  #   comes from one of the templated installation commands -- confirm.
  # Line 'sudo mv /etc/nccl.conf /etc/nccl.conf.bak': remove the default
  #   nccl.conf, which is wrongly configured on many multi-GPU Azure VMs and
  #   causes failures for multi-GPU workloads using NCCL.
  - mkdir -p ~/.ssh; touch ~/.ssh/config;
    {{ conda_installation_commands }}
    {{ ray_skypilot_installation_commands }}
    {{ copy_skypilot_templates_commands }}
    touch ~/.sudo_as_admin_successful;
    sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
    {%- if docker_image is none %}
    sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
    sudo systemctl stop jupyter > /dev/null 2>&1 || true;
    sudo systemctl disable jupyter > /dev/null 2>&1 || true;
    sudo systemctl stop jupyterhub > /dev/null 2>&1 || true;
    sudo systemctl disable jupyterhub > /dev/null 2>&1 || true;
    {%- endif %}
    mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa" ~/.ssh/config) || printf "Host *\n  StrictHostKeyChecking no\n  IdentityFile ~/.ssh/sky-cluster-key\n  IdentityFile ~/.ssh/id_rsa\n" >> ~/.ssh/config;
    [ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf');
    sudo mv /etc/nccl.conf /etc/nccl.conf.bak || true;
    {{ ssh_max_sessions_config }}
